package com.alibaba.zihao.service;

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.model.Page;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Crawler that records the title and question text of Zhihu question pages.
 *
 * <p>Each visited page whose URL has the form
 * {@code http://www.zhihu.com/question/<digits>} has its document title and the text of
 * its {@code div[id=zh-question-detail]} element handed to a {@code ZihaoWrite}. Pages
 * with any other URL are ignored.
 *
 * Created by jason on 3/14/15.
 */
public class ZhihuCrawler extends BreadthCrawler{
    /**
     * Matches question page URLs. Compiled once instead of on every visited page; the
     * dots of the host name are escaped so they only match a literal {@code '.'}.
     */
    private static final Pattern QUESTION_URL =
            Pattern.compile("^http://www\\.zhihu\\.com/question/[0-9]+");

    /** Sink that receives the extracted text; its {@code write} calls may throw {@link IOException}. */
    ZihaoWrite writeZihao = new ZihaoWrite();

    /**
     * Handles one fetched page: if it is a question page, writes its title and question
     * detail to {@link #writeZihao}, each preceded by a marker line.
     *
     * <p>A failed write is reported on standard error together with the page URL and is
     * otherwise ignored, so one bad page does not propagate an exception to the caller.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page){
        String url = page.getUrl();
        // The whole URL must match, so anything beyond the numeric id is rejected.
        if(!QUESTION_URL.matcher(url).matches()){
            return;
        }
        System.out.println("processing "+url);

        /*extract title of the page*/
        String title=page.getDoc().title();
        try {
            // NOTE(review): the markers read "tile" and "answer" although the values that
            // follow are the title and the question detail. They are kept byte-identical
            // in case existing consumers of the output depend on them -- confirm and rename.
            writeZihao.write("tile+++++++++++++++++++++++++++++++++++++++" + "\n"+ "\n");
            writeZihao.write(title + "\n" + "\n");

            /*extract the content of question*/
            String question = page.getDoc().select("div[id=zh-question-detail]").text();
            writeZihao.write("answer+++++++++++++++++++++++++++++++++++++++" + "\n" + "\n");
            writeZihao.write(question + "\n" + "\n");
        }catch (IOException e){
            // Best-effort: keep going, but say which page was lost and why.
            System.err.println("failed to write " + url + ": " + e);
        }
    }
}
